Project 1

Bias Correction: Quantile–Quantile Mapping

Module 2
Project
Author

Coleman Nickum (CN33)

Published

November 13, 2023

1 Setup

using Dates
using MultivariateStats
using Plots
using NCDatasets
using StatsBase
using Unitful
using DataFrames
using Statistics

# Global plot defaults for the whole notebook: extra margins, a consistent
# figure size, and thicker lines.
Plots.default(; margin=4Plots.mm, size=(700, 400), linewidth=2)

2 Precipitation Data: ERA5 Reanalysis Dataset

# Dataset covers daily precipitation (mm) from 1/1/1979 to 12/31/2020
# NOTE(review): absolute local path — this breaks on any other machine;
# consider a path relative to the project root.
ds_precip = NCDataset("/Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/precip.nc")
println(ds_precip)
Dataset: /Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/precip.nc
Group: /

Dimensions
   lon = 10
   lat = 10
   time = 15341

Variables
  lon   (10)
    Datatype:    Union{Missing, Float32} (Float32)
    Dimensions:  lon
    Attributes:
     _FillValue           = NaN
     long_name            = Longitude
     units                = degrees_east
     axis                 = X
     standard_name        = longitude
     actual_range         = Float32[0.25, 359.75]
     coordinate_defines   = center

  lat   (10)
    Datatype:    Union{Missing, Float32} (Float32)
    Dimensions:  lat
    Attributes:
     _FillValue           = NaN
     actual_range         = Float32[89.75, -89.75]
     long_name            = Latitude
     units                = degrees_north
     axis                 = Y
     standard_name        = latitude
     coordinate_defines   = center

  time   (15341)
    Datatype:    Union{Missing, DateTime} (Float64)
    Dimensions:  time
    Attributes:
     _FillValue           = NaN
     long_name            = Time
     axis                 = T
     standard_name        = time
     coordinate_defines   = start
     actual_range         = [692496.0, 701232.0]
     delta_t              = 0000-00-01 00:00:00
     avg_period           = 0000-00-01 00:00:00
     _ChunkSizes          = 1
     units                = hours since 1900-01-01
     calendar             = proleptic_gregorian

  precip   (10 × 10 × 15341)
    Datatype:    Union{Missing, Float32} (Float32)
    Dimensions:  lon × lat × time
    Attributes:
     _FillValue           = -9.96921e36
     var_desc             = Precipitation
     level_desc           = Surface
     statistic            = Total
     parent_stat          = Other
     long_name            = Daily total of precipitation
     cell_methods         = time: sum
     avg_period           = 0000-00-01 00:00:00
     actual_range         = Float32[0.0, 428.02423]
     units                = mm
     valid_range          = Float32[0.0, 1000.0]
     dataset              = CPC Global Precipitation
     _ChunkSizes          = Int32[1, 360, 720]
     missing_value        = -9.96921e36

# Pull the coordinate vectors and the full precipitation cube into memory.
precip_time = ds_precip["time"][:];
precip_lon = ds_precip["lon"][:];
precip_lat = ds_precip["lat"][:];
precip = ds_precip["precip"][:,:,:];
# Attach physical units (mm) via Unitful so later conversions are unit-safe.
precip = precip .* 1u"mm";
# Latitudes are stored north-to-south (see actual_range in the dump above);
# flip the latitude vector and dim 2 of the cube so latitude increases with
# index, as the heatmaps expect.
precip_lat = reverse(precip_lat)
precip = reverse(precip, dims=2);

2.1 Precipitation Heatmap: Past vs Present

# Side-by-side precipitation maps for the first and last day on record.
# The data slice is transposed so rows correspond to latitude.
precip_panels = [
    heatmap(
        precip_lon,
        precip_lat,
        precip[:, :, t]';
        xlabel="Longitude",
        ylabel="Latitude",
        title=panel_title,
    ) for (t, panel_title) in [
        (1, "Precipitation on 1/1/1979"),
        (size(precip, 3), "Precipitation on 12/31/2020"),
    ]
]
plot(precip_panels...; layout=(1, 2), size=(900, 400))
close(ds_precip)
closed Dataset

3 Temperature Data: ERA5 Reanalysis Dataset

# Dataset covers daily temperature (Kelvin) from 1/1/1979 to 12/31/2020
# NOTE(review): absolute local path — this breaks on any other machine;
# consider a path relative to the project root.
ds_temp = NCDataset("/Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/temp.nc")
println(ds_temp)
Dataset: /Users/coleman/Documents/GitHub/project-01-precipitation-downscaling-ColemanNickum/data/raw/temp.nc
Group: /

Dimensions
   longitude = 21
   latitude = 33
   time = 15341

Variables
  longitude   (21)
    Datatype:    Union{Missing, Float64} (Float64)
    Dimensions:  longitude
    Attributes:
     _FillValue           = NaN

  latitude   (33)
    Datatype:    Union{Missing, Float32} (Float32)
    Dimensions:  latitude
    Attributes:
     _FillValue           = NaN
     units                = degrees_north
     long_name            = latitude

  time   (15341)
    Datatype:    DateTime (Int64)
    Dimensions:  time
    Attributes:
     units                = days since 1979-01-01 00:00:00
     calendar             = proleptic_gregorian

  t2m   (21 × 33 × 15341)
    Datatype:    Union{Missing, Float32} (Float32)
    Dimensions:  longitude × latitude × time
    Attributes:
     _FillValue           = NaN
     units                = K
     long_name            = 2 metre temperature

# Pull the coordinate vectors and the full 2m-temperature cube into memory.
# Note this file names its dims "longitude"/"latitude" (vs "lon"/"lat" above).
temp_time = ds_temp["time"][:];
temp_lon = ds_temp["longitude"][:];
temp_lat = ds_temp["latitude"][:];
temp = ds_temp["t2m"][:,:,:];
# Flip latitude (stored north-to-south) and dim 2 of the cube to match.
temp_lat = reverse(temp_lat)
temp = reverse(temp, dims=2);

3.1 Temperature Heatmap: Past vs Present

# Side-by-side temperature maps for the first and last day on record.
# The data slice is transposed so rows correspond to latitude.
temp_panels = [
    heatmap(
        temp_lon,
        temp_lat,
        temp[:, :, t]';
        xlabel="Longitude",
        ylabel="Latitude",
        title=panel_title,
    ) for (t, panel_title) in [
        (1, "Temperature on 1/1/1979"),
        (size(temp, 3), "Temperature on 12/31/2020"),
    ]
]
plot(temp_panels...; layout=(1, 2), size=(900, 400))

3.2 Ensuring temperature and precipitation correspond to the same time period

# The two files must share an identical daily time axis for paired training.
@assert temp_time == precip_time
# Drop the time-of-day component; keep calendar dates only.
time_data = Dates.Date.(temp_time)
15341-element Vector{Date}:
 1979-01-01
 1979-01-02
 1979-01-03
 1979-01-04
 1979-01-05
 1979-01-06
 1979-01-07
 1979-01-08
 1979-01-09
 1979-01-10
 1979-01-11
 1979-01-12
 1979-01-13
 ⋮
 2020-12-20
 2020-12-21
 2020-12-22
 2020-12-23
 2020-12-24
 2020-12-25
 2020-12-26
 2020-12-27
 2020-12-28
 2020-12-29
 2020-12-30
 2020-12-31

4 Splitting Data: Training Period vs Testing Period

# Hold out the final 10 years for testing; train on everything before.
# Training: 1/1/1979–12/31/2010 (32 years); testing: 2011–2020 (10 years).
# NOTE(review): the original comment claimed a 70:30 split with a 12-year
# test period (2009–2021), but the code subtracts `Dates.Year(10)` — with
# 42 years of data this is a 32:10 (~76:24) split. Comment corrected to
# match the code's actual behavior.
split_idx = findfirst(==(time_data[end] - Dates.Year(10)), time_data)
train_idx = 1:split_idx
test_idx = (split_idx+1):length(time_data);

4.1 Testing and Training Period Variables

# Slice both data cubes along the time dimension into training/testing sets.
precip_train = precip[:, :, train_idx];
precip_test = precip[:, :, test_idx];
temp_train = temp[:, :, train_idx];
temp_test = temp[:, :, test_idx];

5 Preprocessing Data: Climatology and Mean Centering

"""
    preprocess(temp, temp_reference) -> AbstractMatrix

Remove the reference climatology from `temp` and flatten the anomalies to 2D.

The climatology (time-mean over `dims=3`) is computed from `temp_reference`,
so training and testing data can both be centered against the *same*
training-period baseline.

Returns a matrix with one row per grid cell (lon × lat) and one column per
time step — the feature × observation layout expected by
`MultivariateStats.fit(PCA, ...)`.

NOTE(review): the original inline comment described the output as
time (rows) × locations (columns); the code actually produces
locations × time, which is what the downstream PCA fit needs.
"""
function preprocess(temp::Array{T, 3}, temp_reference::Array{T, 3})::AbstractMatrix where T
    # Per-grid-cell mean over the reference period (size: lon × lat × 1).
    climatology = mean(temp_reference, dims=3)
    # Anomalies: deviations from the reference climatology (broadcast in time).
    anomalies = temp .- climatology
    # Flatten space: (lon, lat, time) -> (lon*lat, time).
    return reshape(anomalies, size(temp, 1) * size(temp, 2), size(temp, 3))
end
preprocess (generic function with 1 method)

5.1 Apply to Training and Testing Datasets

# Preprocess temp_train and temp_test; both are centered against the temp
# TRAINING data so they share the same climatology baseline.
# Grid dimensions, reused later when reshaping PC loadings back onto the map.
n_lon, n_lat, n_t = size(temp)
temp_training_matrix = preprocess(temp_train, temp_train);
temp_testing_matrix = preprocess(temp_test, temp_train);

6 Principal Component Analysis

6.1 Fitting

# Fit a PCA model to the training period; keep at most 10 principal
# components, or fewer if 99.9% of the variance is explained sooner.
PCA_model = fit(PCA, temp_training_matrix; maxoutdim=10, pratio=0.999);

6.2 Plotting Variance Explained by PCs

# Variance accounted for by each principal component, and the cumulative
# fraction of the total variance they explain.
variance_explained = principalvars(PCA_model)
total_var = var(PCA_model)
cumulative_var = cumsum(variance_explained)./total_var
10-element Vector{Float32}:
 0.9599857
 0.98082733
 0.98715866
 0.9929077
 0.99447197
 0.9957567
 0.99657375
 0.9971035
 0.9975621
 0.99789464
# Scree plots: per-PC fraction of variance, and its cumulative sum.
frac_panel = plot(
    variance_explained / total_var;
    xlabel="Number of PCs",
    ylabel="Fraction of Variance Explained",
    label=false,
    title="Variance Explained"
)
cum_panel = plot(
    cumulative_var;
    xlabel="Number of PCs",
    ylabel="Fraction of Variance Explained",
    label=false,
    title="Cumulative Variance Explained"
)
plot(frac_panel, cum_panel; layout=(1, 2), size=(900, 400))
Note

I chose to select 4 principal components because, after plotting the variance explained, the first 4 principal components accounted for a cumulative variance of over 0.99 and retain enough information while also reducing the noise of outliers.

    # NOTE(review): this extraction appears to be an unused leftover — the
    # loop below recomputes the projection column for each PC; consider
    # deleting this line.
    pc = projection(PCA_model)[:, 3];

6.3 Plotting Spatial Patterns

# Spatial pattern (loading map) of each of the first four principal components.
# NOTE(review): the original loop named its panel variable `pi`, shadowing
# `Base.pi`; renamed to `plt` below.
p = []
for i in 1:4
    # Column i of the projection matrix is the spatial loading of PC i;
    # reshape it back onto the lon × lat grid and transpose for plotting.
    pc = projection(PCA_model)[:, i]
    pc_reshaped = reshape(pc, n_lon, n_lat)'
    plt = heatmap(
        temp_lon,
        temp_lat,
        pc_reshaped;
        xlabel="Longitude",
        ylabel="Latitude",
        title="PC $i",
        aspect_ratio=:equal,
        cmap=:PuOr
    )
    push!(p, plt)
end
plot(p...; layout=(1, 4), size=(1500, 600))

6.4 Plotting Time Series

# Time series of the leading PCs over the TRAINING period, scattered against
# month-of-year to reveal seasonality.
pc_ts = predict(PCA_model, temp_training_matrix)
# BUG FIX: the original used `Dates.month.(time_data)` (all 15341 days), but
# `pc_ts` has one column per *training* day only, so the scatter's x and y
# vectors had mismatched lengths. Restrict months to the training indices.
Months = Dates.month.(time_data[train_idx])
custom_xticks = [1, 3, 6, 9, 12]
p = []
for i in 1:4
    plt = scatter(
        Months,
        pc_ts[i, :];
        xlabel="Months in Year",
        ylabel="PC $i",
        title="PC $i",
        label=false,
        alpha=0.3,
        color=:blue,
        xticks=(custom_xticks, custom_xticks)
    )
    push!(p, plt)
end
plot(p...; layout=(1, 4), size=(1800, 600))

7 Scatter Plots: Comparison of Influence on Rainfall

# Domain-average precipitation (converted to inches) for each training day;
# `skipmissing` drops missing grid cells before averaging.
avg_precip =
    ustrip.(
        u"inch", [mean(skipmissing(precip_train[:, :, t])) for t in 1:size(precip_train, 3)]
    )
# Days where every cell is missing average to NaN; treat those as zero rain.
avg_precip = replace(avg_precip, NaN => 0)

# Indices of the wettest 2% of training days (above the 98th percentile).
# NOTE(review): the original computed this identical index set three times
# (p1_idx, p2_idx, p3_idx); computed once here and reused.
rainy_idx = findall(avg_precip .> quantile(avg_precip, 0.98))

# Pairwise PC scatter plots for rainy days, colored by daily rainfall amount.
p1 = scatter(
    pc_ts[2, rainy_idx],
    pc_ts[3, rainy_idx];
    zcolor=avg_precip[rainy_idx],
    xlabel="PC 2",
    ylabel="PC 3",
    markersize=5,
    clims=(0, 2.75),
    title="Rainy Days",
    label=false
)

p2 = scatter(
    pc_ts[4, rainy_idx],
    pc_ts[3, rainy_idx];
    zcolor=avg_precip[rainy_idx],
    xlabel="PC 4",
    ylabel="PC 3",
    markersize=5,
    clims=(0, 2.75),
    title="Rainy Days",
    label=false
)

p3 = scatter(
    pc_ts[4, rainy_idx],
    pc_ts[2, rainy_idx];
    zcolor=avg_precip[rainy_idx],
    xlabel="PC 4",
    ylabel="PC 2",
    markersize=5,
    clims=(0, 2.75),
    title="Rainy Days",
    label=false
)
plot(p1, p2, p3; size=(1000, 600), link=:both)

8 K-NN: Resampling Algorithm

"""
    euclidean_distance(x, y)

Return the Euclidean (L2) distance between vectors `x` and `y`.
"""
function euclidean_distance(x::AbstractVector, y::AbstractVector)::AbstractFloat
    return sqrt(sum(abs2, x .- y))
end

"""
    nsmallest(x, n) -> Vector{Int}

Return the indices of the `n` smallest elements of `x`, ordered by ascending
value. (For tied values the order among the ties is unspecified.)

Uses `partialsortperm`, which only orders the leading `n` positions, instead
of the original fully-sorting `sortperm(x)[1:n]`.
"""
function nsmallest(x::AbstractVector, n::Int)::Vector{Int}
    return collect(partialsortperm(x, 1:n))
end

"""
    knn(X, X_i, K) -> (index, row)

Sample one of the `K` nearest rows of `X` (observations in rows) to the query
`X_i`, weighting each neighbor by inverse distance, and return its row index
together with the row itself.

BUG FIX: when the query coincided exactly with a training row (distance 0),
the original inverse-distance weights became `Inf` and normalized to `NaN`,
corrupting the weighted sampling. Zero-distance neighbors are now sampled
uniformly among themselves.
"""
function knn(X::AbstractMatrix, X_i::AbstractVector, K::Int)::Tuple{Int,AbstractVector}
    # Distance from the query to every row (observation) of X.
    dist = [euclidean_distance(X_i, X[j, :]) for j in 1:size(X, 1)]
    idx = nsmallest(dist, K)
    exact = idx[iszero.(dist[idx])]
    if isempty(exact)
        # Inverse-distance weights, normalized to sum to one.
        w = 1 ./ dist[idx]
        w ./= sum(w)
        idx_sample = sample(idx, Weights(w))
    else
        # One or more exact matches: pick uniformly among them.
        idx_sample = sample(exact)
    end
    return (idx_sample, vec(X[idx_sample, :]))
end
knn (generic function with 1 method)

8.1 Combining KNN and PCA

"""
KNN resampling algorithm

temp_train: the training temperature data. This should be the raw field, not the prinipal components. Inside the function, convert to principal components using `n_pca` principal components. 
temp_test: the temperature data to predict. This should be the raw field, not the prinipal components. Inside the function, convert to principal components using `n_pca` principal components.
precip_train: the training precipitation data.
"""
function predict_knn(temp_train, temp_test, precip_train; n_pca::Int)
    X_train = preprocess(temp_train, temp_train)
    X_test = preprocess(temp_test, temp_train)

    # fit the PCA model to the training data
    pca_model = fit(PCA, X_train; maxoutdim=n_pca)

    # project the test data onto the PCA basis
    train_embedded = predict(pca_model, X_train)
    test_embedded = predict(pca_model, X_test)

    # use the `knn` function for each point in the test data
    precip_pred = map(1:size(X_test, 2)) do i
        idx, _ = knn(train_embedded', test_embedded[:, i], 3)
        precip_train[:, :, idx]
    end

    # return a matrix of predictions
    return precip_pred
end
predict_knn

8.2 Predicted vs. Actual Values

# Draw four random test days and compare KNN-predicted precipitation fields
# against the observed fields for those same days.
t_sample = rand(1:size(temp_test, 3), 4)
precip_pred = predict_knn(temp_train, temp_test[:, :, t_sample], precip_train; n_pca=4)

p = map(eachindex(t_sample)) do ti
    t = t_sample[ti]
    # Transpose so rows correspond to latitude for the heatmaps.
    y_pred = precip_pred[ti]'
    y_actual = precip_test[:, :, t]'
    # Shared color limit so the two panels are directly comparable;
    # strip the Unitful mm units for the `clims` keyword.
    cmax = max(maximum(skipmissing(y_pred)), maximum(skipmissing(y_actual)))
    cmax = ustrip(u"mm", cmax)

    p1 = heatmap(
        precip_lon,
        precip_lat,
        y_pred;
        xlabel="Longitude",
        ylabel="Latitude",
        title="Predicted",
        aspect_ratio=:equal,
        clims=(0, cmax)
    )
    p2 = heatmap(
        precip_lon,
        precip_lat,
        y_actual;
        xlabel="Longitude",
        ylabel="Latitude",
        title="Actual",
        aspect_ratio=:equal,
        clims=(0, cmax)
    )
    plot(p1, p2; layout=(2, 1), size=(1000, 400))
end
plot(p...; layout=(2, 3), size=(1500, 1200))